
library(tidyverse)
library(quanteda)
library(conText)
library(ggpubr)
library(patchwork)


# All this at the start is just making the Fox dataset

# # Load data from dataverse
# fox_text <- read_csv("foxnews_text.csv")
# fox_trans <- read_csv("foxnews-transcript.csv")
# # Check it out
# glimpse(fox_text)
# 
# # Reorganize data (rotate 90 degrees)
# fox_text_long <- fox_text %>%
#   gather(key = "html_file", value = "text")
# # Optionally, you can rename the column values to match the original column names
# fox_text_long$html_file <- gsub("file", "html_file_", fox_text_long$html_file)
# # Collapse transcripts into single text column
# fox_text_collapsed <- fox_text_long %>%
#   group_by(html_file) %>%
#   summarize(text = paste(text, collapse = " "))
# 
# # Remove .html from trans file so that it matches the text file
# fox_trans <- fox_trans %>%
#   mutate(html_file = gsub(".html", "", html_file))
# # Join the two dataframes by "html_file"
# fox_transcripts <- inner_join(fox_trans, fox_text_collapsed, by = "html_file")
# 
# # Make year and month columns
# library(lubridate)
# fox_transcripts <- fox_transcripts %>%
#   mutate(year = year(ymd_hms(publicationDate)))
# fox_transcripts <- fox_transcripts %>%
#   mutate(month = month(ymd_hms(publicationDate)))
# 
# # Select relevant data columns
# fox_transcripts <- fox_transcripts %>%
#   select(title, imageUrl, url, publicationDate, category, isBreaking, isLive, html_file, year, month, text)
# 
# # Write CSV for later use
# write_csv(fox_transcripts, file = "fox_transcripts_full.csv")
# 
# 


# ------------------------------------------------------------------------------
# # Load YouTube data
# load("ytvideo_data_jesse.Rdata") #video_meta2
# # Load Fox Data
# fox <- read_csv("fox_transcripts_full.csv")
# # Load Stormfront data
# sf <- read_csv("stormfront_PNAS.csv")
# 
# 
# 
# # Get Year for youtube content
# # Assuming your dataframe is named df
# video_meta2 <- video_meta2 %>%
#   mutate(
#     date_obj = as.POSIXct(date_posted, origin="1970-01-01"),
#     year = format(date_obj, "%Y")
#   ) %>%
#   select(-date_obj) # Optional: remove the intermediate date_obj column
# video_meta2 <- video_meta2 %>%
#   filter(!is.na(clean_content) & clean_content != "")
# video_meta2$year <- as.numeric(video_meta2$year)
# youtube <- video_meta2
# 
# # Calculate average lengths of entries
# string_lengths_s <- nchar(sf$text)
# # Calculate the average string length
# average_length_s <- mean(string_lengths_s, na.rm = TRUE)
# # Print the result
# print(average_length_s) # (STORMFRONT = 460 characters per entry)
# 
# string_lengths_f <- nchar(fox$text)
# # Calculate the average string length
# average_length_f <- mean(string_lengths_f, na.rm = TRUE)
# # Print the result
# print(average_length_f) # (FOX = 21517 characters per entry)
# 
# string_lengths_y <- nchar(youtube$clean_content)
# # Calculate the average string length
# average_length_y <- mean(string_lengths_y, na.rm = TRUE)
# # Print the result
# print(average_length_y) # (YOUTUBE = 22595 characters per entry)
# # This means that each row is relatively balanced between Fox and Youtube, but
# # that Stormfront has about 50 (47.8) times smaller posts per row.
# 
# #get some basic info about the datasets
# table(video_meta2$year)
# table(fox$year)
# table(sf$year)
# 
# # Assuming 'sf' is your data frame and 'year' is the column with numerical data
# # Create the histogram using ggplot2
# s <- ggplot(sf, aes(x=year)) +
#   geom_histogram(binwidth = 1, fill = "grey", color = "black") +  # Customize bins, colors
#   theme_minimal() +  # Use a minimal theme for a professional look
#   labs(title = "Stormfront Posts by Year",
#        x = "Year",
#        y = "Number of Posts") +  # Add labels
#   theme(text = element_text(size = 12),  # Adjust text size
#         plot.title = element_text(hjust = 0.5),  # Center the plot title
#         axis.title = element_text(size = 14),  # Adjust axis title size
#         axis.text = element_text(size = 12)) + # Adjust axis text size
#   xlim(2005, 2020)
# # Create the histogram using ggplot2
# f <- ggplot(fox, aes(x=year)) +
#   geom_histogram(binwidth = 1, fill = "blue", color = "black") +  # Customize bins, colors
#   theme_minimal() +  # Use a minimal theme for a professional look
#   labs(title = "Fox News Transcripts by Year",
#        x = "Year",
#        y = "Number of Transcripts") +  # Add labels
#   theme(text = element_text(size = 12),  # Adjust text size
#         plot.title = element_text(hjust = 0.5),  # Center the plot title
#         axis.title = element_text(size = 14),  # Adjust axis title size
#         axis.text = element_text(size = 12)) + # Adjust axis text size
#   xlim(2005, 2020)
# # Create the histogram using ggplot2
# y <- ggplot(youtube, aes(x=year)) +
#   geom_histogram(binwidth = 1, fill = "red", color = "black") +  # Customize bins, colors
#   theme_minimal() +  # Use a minimal theme for a professional look
#   labs(title = "Youtube Videos by Year",
#        x = "Year",
#        y = "Number of Videos") +  # Add labels
#   theme(text = element_text(size = 12),  # Adjust text size
#         plot.title = element_text(hjust = 0.5),  # Center the plot title
#         axis.title = element_text(size = 14),  # Adjust axis title size
#         axis.text = element_text(size = 12)) + # Adjust axis text size
#   xlim(2005, 2020)
# print(s)
# print(f)
# print(y)
# ggarrange(s, f, y, ncol = 3, nrow = 1)
# 
# 
# # SSSAAAMMMMPLLLLEEEE 11111111
# # This sample was used in making the local embeddings and transformations
# set.seed(12345)
# # Take samples from each
# sample_size_fox <- 100
# fox_sample_1 <- fox %>%
#   mutate(source = "Fox") %>%
#   select(text, year, source) %>%
#   group_by(year) %>%
#   sample_n(sample_size_fox) %>%
#   ungroup()
# sample_size_sf <- 4000
# sf_sample1 <- sf %>%
#   filter(forumSection %in% c("Ideology and Philosophy", "Newslinks & Articles", "Politics & Continuing Crises")) %>%
#   mutate(source = "Stormfront") %>%
#   select(text, year, source) %>%
#   group_by(year) %>%
#   sample_n(sample_size_sf) %>%
#   ungroup()
# sample_size_yt <- 200
# yt_sample <- video_meta2 %>%
#   mutate(source = "Youtube AIN") %>%
#   select(clean_content, year, source) %>%
#   group_by(year) %>%
#   filter(n() >= sample_size_yt) %>%
#   sample_n(sample_size_yt) %>%
#   ungroup()
# colnames(yt_sample)[colnames(yt_sample) == "clean_content"] <- "text"
# yt_sample$year <- as.numeric(yt_sample$year)
# # See text by year
# table(sf_sample1$year)
# table(fox_sample_1$year)
# table(yt_sample$year)
# 
# # Apply the word count calculation to each element in the text column
# word_counts <- sapply(yt_sample$text, function(x) length(unlist(strsplit(as.character(x), "\\s+"))))
# 
# # Optionally, if you want to know the total number of words across all rows
# total_words <- sum(word_counts)
# print(total_words)
# # Fox has 4058126 (n=50/y) total words and Stormfront 3776002 (n=2000/y) total words, which is
# # relatively balanced for now. and 3314231 (n=100/y) from AIN
# 
# # FOR NOW, I'VE GOTTEN RID OF THE YOUTUBE DATA *******************
# # Combine the samples into one dataframe
# sample1 <- bind_rows(fox_sample_1, sf_sample1)
# sample1 <- sample1 %>%
#   filter(!is.na(text) & text != "")
# 
# # Creating the 'trump' column
# sample1 <- sample1 %>%
#   mutate(trump = ifelse(year >= 2017, 1, 0)) %>%
#   mutate(election_ctrl = ifelse(year >= 2014, 1, 0)) %>%
#   mutate(election_test = ifelse(year >= 2015, 1, 0))
# 
# glimpse(sample1)
# 
# write_csv(sample1, "fox_stormfront_sample.csv")

# #----------------------
# # conText
# # Make objects for conText and quanteda
# sample1$text <- tolower(sample1$text)
# sample1_corpus <- corpus(sample1, text_field = "text")
# sample1_tokens <- tokens(sample1_corpus,
#                          remove_punct=T,
#                          remove_symbols=T,
#                          remove_numbers=T,
#                          remove_separators=T) %>%
#   tokens_remove(pattern = stopwords("en")) %>%
#   tokens_remove(pattern = c("like", "know", "just", "think", "one", "can", "get", "say", "said",
#                             "yeah", "mean", "look", "lot", "dont", "take", "video", "youtube", "na", "--")) %>%
#   tokens_select(min_nchar = 3)
# sample1_dfm <- dfm(sample1_tokens)
# 
# # Save quanteda items as
# save(sample1_corpus, file = "sample1_corpus.RData")
# save(sample1_tokens, file = "sample1_tokens.RData")
# save(sample1_dfm, file = "sample1_dfm.RData")

# Load the pre-built quanteda objects for sample 1 (corpus, tokens, dfm) that
# the commented-out section above saved to disk.
load("sample1_corpus.RData")
load("sample1_tokens.RData")
load("sample1_dfm.RData")

# Quick sanity check: the 50 most frequent features in the sample
topfeatures(sample1_dfm, n = 50)

# Now, move into the language of conText
# Only use features that appear at least 10 times in the corpus
# (the threshold is the `min_termfreq` value passed to dfm_trim() below)
feats_sample <- dfm(sample1_tokens, tolower = TRUE, verbose = FALSE) %>%
  dfm_trim(min_termfreq = 10) %>%
  featnames()

# leave the pads so that non-adjacent words will not become adjacent
toks_nostop_feats_sample <- tokens_select(sample1_tokens, feats_sample, padding = TRUE)

# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# # SSSAAAMMMMPLLLLEEEE 2222222
# # This generates an overall dataset, and includes YT content creators
# set.seed(12345)
# # Take samples from each
# 
# fox_sample_2 <- fox %>%
#   mutate(source = "Fox") %>%
#   mutate(creator = source) %>%
#   select(text, year, source, creator)
# 
# sf_sample_2 <- sf %>%
#   filter(forumSection %in% c("Ideology and Philosophy", "Newslinks & Articles", "Politics & Continuing Crises")) %>%
#   mutate(source = "Stormfront") %>%
#   mutate(creator = source) %>%
#   select(text, year, source, creator)
# 
# yt_sample_2 <- youtube %>%
#   mutate(source = "Youtube AIN") %>%
#   mutate(creator = video_op) %>%
#   select(clean_content, year, source, creator)
# 
# colnames(yt_sample_2)[colnames(yt_sample_2) == "clean_content"] <- "text"
# yt_sample_2$year <- as.numeric(yt_sample_2$year)
# 
# 
# # FOR NOW, I'VE GOTTEN RID OF THE YOUTUBE DATA *******************
# # Combine the samples into one dataframe
# sample2 <- bind_rows(fox_sample_2, sf_sample_2, yt_sample_2)
# sample2 <- sample2 %>%
#   filter(!is.na(text) & text != "")
# 
# 
# # Creating the 'trump' column
# sample2 <- sample2 %>%
#   mutate(trump = ifelse(year >= 2017, 1, 0))
# 
# # conText
# # Make objects for conText and quanteda
# sample2$text <- tolower(sample2$text)
# sample2_corpus <- corpus(sample2, text_field = "text")
# sample2_tokens <- tokens(sample2_corpus,
#                           remove_punct=T,
#                           remove_symbols=T,
#                           remove_numbers=T,
#                           remove_separators=T) %>%
#   tokens_remove(pattern = stopwords("en")) %>%
#   tokens_remove(pattern = c("like", "know", "just", "think", "one", "can", "get", "say", "said",
#                             "yeah", "mean", "look", "lot", "dont", "take", "video", "youtube", "na", "--")) %>%
#   tokens_select(min_nchar = 3)
# sample2_dfm <- dfm(sample2_tokens)
# 
# # Save quanteda items as
# save(sample2_corpus, file = "sample2_corpus.RData")
# save(sample2_tokens, file = "sample2_tokens.RData")
# save(sample2_dfm, file = "sample2_dfm.RData")

# load("sample2_corpus.RData")
# load("sample2_tokens.RData")
# load("sample2_dfm.RData")
# 
# topfeatures(sample2_dfm, n = 50)
# 
# # Now, move into the language of conText
# # Only use features that appear at least 200 times in the corpus
# feats_sample2 <- dfm(sample2_tokens, tolower=T, verbose = FALSE) %>% dfm_trim(min_termfreq = 200) %>% featnames()
# 
# # leave the pads so that non-adjacent words will not become adjacent
# toks_nostop_feats_sample2 <- tokens_select(sample2_tokens, feats_sample2, padding = TRUE)


# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------
# -------------------------














# make localized glove model for scientific vanguard
library(text2vec)

# #---------------------
# # estimate glove model
# #----------------------
# 
# # construct the feature co-occurrence matrix for our toks_nostop_feats object (see above)
# toks_fcm_sample <- fcm(toks_nostop_feats_sample, context = "window", window = 6, count = "frequency", tri = FALSE) # important to set tri = FALSE
# 
# # estimate glove model using text2vec
# glove <- GlobalVectors$new(rank = 300,
#                            x_max = 10,
#                            learning_rate = 0.05)
# wv_main <- glove$fit_transform(toks_fcm_sample, n_iter = 10,
#                                convergence_tol = 1e-3,
#                                n_threads = 2) # set to 'parallel::detectCores()' to use all available cores
# 
# wv_context <- glove$components
# local_glove_sample <- wv_main + t(wv_context) # word vectors
# 
# # qualitative check
# #find_nns(local_glove_sample['jews',], pre_trained = local_gloveV, N = 5, candidates = featsV)
# 
# # now, TRANSFORMATION MATRIX
# # compute transform
# # weighting = 'log' works well for smaller corpora
# # for large corpora use a numeric value e.g. weighting = 500
# # see: https://arxiv.org/pdf/1805.05388.pdf
# local_transform_sample <- compute_transform(x = toks_fcm_sample, pre_trained = local_glove_sample, weighting = 'log')
# local_transform_sample_alt <- compute_transform(x = toks_fcm_sample, pre_trained = cr_glove_subset, weighting = 'log')
# 
# # Now we save both the local_glove_SF and local_transform_SF as Rdata
# # save the matrix as an .RData file
# save(local_glove_sample, file = "local_glove_sample.RData")
# save(local_transform_sample, file = "local_transform_sample.RData")
# 
# # Load local matrix files for Stormfront data
# load("local_glove_sample.RData")
# load("local_transform_sample.RData")
# 
# 
# 
# #---------------------
# # estimate ANOTHER glove model
# #----------------------
# 
# # construct the feature co-occurrence matrix for our toks_nostop_feats object (see above)
# toks_fcm_sample2 <- fcm(toks_nostop_feats_sample, context = "window", window = 6, count = "frequency", tri = FALSE) # important to set tri = FALSE
# 
# # estimate glove model using text2vec
# glove <- GlobalVectors$new(rank = 300,
#                            x_max = 10,
#                            learning_rate = 0.05)
# wv_main <- glove$fit_transform(toks_fcm_sample2, n_iter = 10,
#                                convergence_tol = 1e-3,
#                                n_threads = 2) # set to 'parallel::detectCores()' to use all available cores
# 
# wv_context <- glove$components
# local_glove_sample2 <- wv_main + t(wv_context) # word vectors
# 
# # qualitative check
# #find_nns(local_glove_sample['jews',], pre_trained = local_gloveV, N = 5, candidates = featsV)
# 
# # now, TRANSFORMATION MATRIX
# # compute transform
# # weighting = 'log' works well for smaller corpora
# # for large corpora use a numeric value e.g. weighting = 500
# # see: https://arxiv.org/pdf/1805.05388.pdf
# local_transform_sample2 <- compute_transform(x = toks_fcm_sample2, pre_trained = local_glove_sample2, weighting = 'log')
# 
# # Now we save both the local_glove_SF and local_transform_SF as Rdata
# # save the matrix as an .RData file
# save(local_glove_sample2, file = "local_glove_sample2.RData")
# save(local_transform_sample2, file = "local_transform_sample2.RData")

# Load the locally trained GloVe embeddings and the matching transformation
# matrix (second model; per the commented-out code above it is fit on the
# sample 1 tokens).
load("local_glove_sample2.RData")
load("local_transform_sample2.RData")

# The analyses further down reference `local_glove_sample` and
# `local_transform_sample` (the first model), whose load() calls are commented
# out above. Load them as well when the files are present so those objects
# exist; otherwise say so up front instead of failing later with
# "object not found".
if (file.exists("local_glove_sample.RData")) {
  load("local_glove_sample.RData")
} else {
  message("local_glove_sample.RData not found; `local_glove_sample` must already be in the session.")
}
if (file.exists("local_transform_sample.RData")) {
  load("local_transform_sample.RData")
} else {
  message("local_transform_sample.RData not found; `local_transform_sample` must already be in the session.")
}
























# Switched to the SF / YouTube comparison, but not everything below has been
# updated yet.
# OVERALL DISCOURSE CHANGE COMPARISONS
# NOTE(review): `toks_nostop_feats_sample2` is only created in commented-out
# code above, so it must already exist in the session for this section to run.

# Feature co-occurrence matrix restricted to the documents of one source
# (tri = FALSE is needed for the matrix to work with fem()).
source_fcm <- function(source_name) {
  in_source <- docvars(toks_nostop_feats_sample2, 'source') == source_name
  fcm(
    toks_nostop_feats_sample2[in_source, ],
    context = "window",
    window = 6,
    count = "frequency",
    tri = FALSE
  )
}

# Feature-embedding matrix for one co-occurrence matrix, using the local
# embeddings and transformation matrix.
source_fem <- function(cooccurrence) {
  fem(
    cooccurrence,
    pre_trained = local_glove_sample,
    transform = TRUE,
    transform_matrix = local_transform_sample,
    verbose = FALSE
  )
}

# One co-occurrence matrix per source
fcm_fox <- source_fcm("Fox")
fcm_sf <- source_fcm("Stormfront")
fcm_yt <- source_fcm("Youtube AIN")

# One feature-embedding matrix per source (an embedding for each feature)
fem_fox <- source_fem(fcm_fox)
fem_sf <- source_fem(fcm_sf)
fem_yt <- source_fem(fcm_yt)
fem_yt
# Peek at the first few rows/columns of the Fox embeddings
fem_fox[1:5, 1:3]

# "Horizontal" cosine similarity: the same feature compared across two sources.
# The similarity column is renamed per pair so the three tables can be joined.
feat_comp_fox_sf <- feature_sim(x = fem_fox, y = fem_sf) %>%
  rename(fox_sf_sim = value)
feat_comp_fox_yt <- feature_sim(x = fem_fox, y = fem_yt) %>%
  rename(fox_yt_sim = value)
feat_comp_yt_sf <- feature_sim(x = fem_yt, y = fem_sf) %>%
  rename(yt_sf_sim = value)

# Overall horizontal cosine similarity measurement between the matrices:
# one row per feature present in all three pairwise comparisons.
overall_fem <- feat_comp_fox_sf %>%
  inner_join(feat_comp_fox_yt, by = "feature") %>%
  inner_join(feat_comp_yt_sf, by = "feature")
# This is an overall horizontal cosine similarity measurement between matrices.


# Build a tokenized corpus of the contexts surrounding the crime / violence
# target terms
bank_toks2 <- tokens_context(
  x = toks_nostop_feats_sample2,
  pattern = c("crime", "criminal", "violent", "violence"),
  window = 6L
)
# Candidate neighbours are limited to features that occur in those contexts
feats2 <- featnames(dfm(bank_toks2))

# Cosine similarity between each source's embedding of the target terms and a
# specific set of features
set.seed(2021L)
test_cos <- get_cos_sim(
  x = bank_toks2,
  groups = docvars(bank_toks2, 'source'),
  features = c("white", "black"),
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  bootstrap = FALSE,
  num_bootstraps = 100,
  as_list = FALSE
)
print(test_cos)

# Ten nearest neighbours of the target terms, per source
test_nns <- get_nns(
  x = bank_toks2,
  N = 10,
  groups = docvars(bank_toks2, 'source'),
  candidates = feats2,
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  bootstrap = FALSE,
  num_bootstraps = 100,
  confidence_level = 0.95,
  as_list = TRUE
)
print(test_nns)

# NOTE(review): the two calls below use `bank_toks` / `feats`, which are only
# assigned further down in this file, together with `cr_glove_subset` /
# `cr_transform` (presumably the demo objects that ship with conText --
# confirm) rather than the local embeddings used above.
# Nearest-neighbour ratio with "Fox" as the numerator group
ratio <- get_nns_ratio(
  x = bank_toks,
  N = 10,
  groups = docvars(bank_toks, 'source'),
  numerator = "Fox",
  candidates = feats,
  pre_trained = cr_glove_subset,
  transform = TRUE,
  transform_matrix = cr_transform,
  bootstrap = TRUE,
  num_bootstraps = 100,
  permute = TRUE,
  num_permutations = 100,
  verbose = FALSE
)
plot_nns_ratio(x = ratio, alpha = 0.01, horizontal = FALSE)

# Ten nearest contexts per source
get_ncs(
  x = bank_toks,
  N = 10,
  groups = docvars(bank_toks, 'source'),
  pre_trained = cr_glove_subset,
  transform = TRUE,
  transform_matrix = cr_transform,
  bootstrap = TRUE,
  num_bootstraps = 100,
  as_list = TRUE
)


# EMBEDDING REGRESSIONS
# For every year, regress the embedding of "white" on `source` and keep the
# normed coefficients so they can be compared over time.

# One data frame of normed coefficients per successfully fitted year; they are
# bound together once after the loop instead of growing a data frame with
# rbind() on every iteration.
coefs_by_year <- list()

# Loop through the years
for (year in 2005:2020) {

  # Subset the tokens to the documents of the given year
  data_year <- toks_nostop_feats_sample2[docvars(toks_nostop_feats_sample2)$year == year]

  # Run the model; a failure in one year is reported and the loop moves on
  tryCatch({
    model2 <- conText(formula = c("white") ~ source,
                      data = data_year,
                      pre_trained = local_glove_sample,
                      transform = TRUE, transform_matrix = local_transform_sample,
                      bootstrap = FALSE, num_bootstraps = 100,
                      stratify = FALSE, jackknife = TRUE,
                      confidence_level = 0.95,
                      permute = TRUE, num_permutations = 100,
                      window = 6L, case_insensitive = TRUE,
                      hard_cut = FALSE, verbose = FALSE)

    # Tag the normed coefficients with the year and store them
    normed_coef <- model2@normed_coefficients
    normed_coef$year <- year
    coefs_by_year[[length(coefs_by_year) + 1L]] <- normed_coef
  }, error = function(e) {
    # Report only the error message (not the whole condition object)
    message("Error in year ", year, ": ", conditionMessage(e))
  })
}

# Combine the yearly results into the final dataframe and view it
df_normed_coefficients <- bind_rows(coefs_by_year)
print(df_normed_coefficients)

# Map p-values to conventional significance stars.
#
# p_value: numeric vector of p-values (a single value works as before).
# Returns a character vector of the same length as `p_value`:
#   "***" for p < .001, "**" for p < .01, "*" for p < .05, "" otherwise.
# Missing p-values get "" (no stars) instead of raising an error, and a
# zero-length input returns character(0).
get_significance <- function(p_value) {
  stars <- rep("", length(p_value))
  known <- !is.na(p_value)
  # Assign from the loosest to the strictest threshold so that the strictest
  # threshold a p-value satisfies is the one that sticks.
  stars[known & p_value < .05] <- "*"
  stars[known & p_value < .01] <- "**"
  stars[known & p_value < .001] <- "***"
  stars
}

# Create a new variable in the dataframe to hold the significance labels.
# vapply() guarantees a character vector (one label per p-value), even when
# there are no rows, where sapply() would return a list.
df_normed_coefficients$signif <- vapply(df_normed_coefficients$p.value, get_significance, character(1))

# Plot the normed coefficients of the "white" ~ source regressions over time,
# one line per coefficient, with the confidence bounds drawn as dotted lines
# instead of error bars. Line widths use `linewidth`, as `size` is deprecated
# for lines in current ggplot2.
ggplot(df_normed_coefficients, aes(x = year, y = normed.estimate, group = coefficient, color = coefficient)) +
  geom_line(linewidth = 1) +
  geom_point(size = 1, color = "#D55E00", shape = 21, fill = "#CC79A7") +
  geom_line(aes(x = year, y = lower.ci), color = "gray60", linewidth = 0.5, linetype = "dotted") +
  geom_line(aes(x = year, y = upper.ci), color = "gray60", linewidth = 0.5, linetype = "dotted") +
  #geom_text(aes(label=signif), vjust=-1, color="black", size=4) +
  labs(x = "Year", y = "Norm of β(hat)", title = "Source Difference in Meaning of Term: White",
       caption = "Table 1. Difference in meaning of term 'white'
       between sources
       (Significance codes:  '***' 0.001 '**' 0.01 '*' 0.05)") +
  theme_grey() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
    axis.title = element_text(face = "bold", size = 10),
    axis.text = element_text(size = 7),
    legend.position = "none"
  )



# Documents per year in sample 1, overall and for Fox only.
# The `sample1` data frame is only built in commented-out code, so the counts
# are read from the document variables of `sample1_corpus` (loaded above),
# which per that commented-out code is built from the rows of `sample1`.
sample1_docvars <- docvars(sample1_corpus)
table(sample1_docvars$year)
sample1_docvars %>%
  filter(source == "Fox") %>%  # Keep only the Fox documents
  group_by(year) %>%           # Group by year
  summarise(count = n())















# ******************* COSINE SIMILARITY OVER TIME *****************************

# Build a tokenized corpus of the contexts surrounding the target terms
# (patterns "crim*" and "violen*")
bank_toks <- tokens_context(x = toks_nostop_feats_sample, pattern = c("crim*", "violen*"), window = 6L)
# we limit candidates to features in our corpus
feats <- featnames(dfm(bank_toks))

# One data frame of cosine similarities per successfully processed year; they
# are bound together once after the loop instead of growing a data frame with
# rbind() on every iteration.
cos_sim_by_year <- list()

# Loop through the years
for (year in 2002:2021) {

  # Subset the contexts to the documents of the given year
  data_year <- bank_toks[docvars(bank_toks)$year == year]

  # Cosine similarity between each source's embedding of the target terms and
  # the listed features ("although" serves as the control term further down);
  # a failure in one year is reported and the loop moves on.
  tryCatch({
    cos_sim <- get_cos_sim(x = data_year,
                           groups = docvars(data_year, 'source'),
                           features = c("white", "whites", "caucasian", "although"),
                           pre_trained = local_glove_sample,
                           transform = TRUE,
                           transform_matrix = local_transform_sample,
                           bootstrap = TRUE,
                           num_bootstraps = 100,
                           as_list = FALSE)

    # Tag the results with the year and store them
    cos_sim$year <- year
    cos_sim_by_year[[length(cos_sim_by_year) + 1L]] <- cos_sim
  }, error = function(e) {
    # Report only the error message (not the whole condition object)
    message("Error in year ", year, ": ", conditionMessage(e))
  })
}

# Combine the yearly results into the final dataframe and view it
df_cos_sim <- bind_rows(cos_sim_by_year)
print(df_cos_sim)

# Prepare the cosine similarities for plotting.
# All steps are chained from one starting point so that none of them is lost:
# previously the YouTube filter restarted from `df_cos_sim`, which silently
# discarded the year filter and the reordering of the `target` levels.
df_cos_sim_filtered <- df_cos_sim %>%
  # Exclude 2022 and the YouTube rows
  filter(year != 2022, target != "Youtube AIN") %>%
  mutate(
    # Reorder the levels of 'target' (this controls the facet order)
    target = factor(target, levels = c("Stormfront", "Youtube AIN", "Fox")),
    # Relabel the control term ("although") as "control"
    feature = as.character(feature),
    feature = ifelse(feature == "although", "control", feature)
  )

# Get values above CONTROL for the cosine similarity measure, rather
# than gross values. This accounts for the unique focus of the target
# terms, rather than the general prevalence of the terms of interest.
# Step 1: Pull out the control value for each target and year
control_values <- df_cos_sim_filtered %>%
  filter(feature == "control") %>%
  select(target, year, control_value = value)
# Step 2: Merge the control values back onto the data frame
# Step 3: Calculate the difference to the control
# Step 4: Drop the control rows themselves
df_cos_sim_filtered <- df_cos_sim_filtered %>%
  left_join(control_values, by = c("target", "year")) %>%
  mutate(value_above_control = value - control_value) %>%
  filter(feature != "control")

# # Plot the results (of cosine similarity ABOVE CONTROL)
# ggplot(df_cos_sim_filtered, aes(x=year, y=value_above_control, color=feature, group=feature)) +
#   geom_smooth(method = "loess", se = FALSE, size = 1) + # Smoothed lines
#   geom_point(size = 2, alpha = 0.15) + # Slightly larger points
#   labs(x="Year", y="Cosine Similarity Above Control", title = "Cosine Similarity To 'Immigration/Border' Over Time",
#        caption = "Figure 1. Change in cosine similarity for selected terms over time.") +
#   theme_dark() +
#   theme(
#     plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
#     axis.title = element_text(face = "bold", size = 12),
#     axis.text = element_text(size = 9),
#     legend.title = element_text(size = 10),
#     legend.text = element_text(size = 9),
#     legend.position = "bottom" # Move legend to bottom
#   ) +
#   scale_color_brewer(palette="RdGy") + # Different color palette
#   guides(color = guide_legend(title = "Features")) +
#   facet_wrap(~ target, scales = "fixed", ncol = 2)

# Plot the raw ("gross") cosine similarities over time: one smoothed line per
# feature, faceted by source. The smoothed line width is set with `linewidth`,
# as `size` is deprecated for lines in current ggplot2.
crime_white_cs <- ggplot(df_cos_sim_filtered, aes(x = year, y = value, color = feature, group = feature)) +
  geom_smooth(method = "loess", se = FALSE, linewidth = 1) + # Smoothed lines
  geom_point(size = 2, alpha = 0.15) + # Slightly larger, mostly transparent points
  labs(x = "Year", y = "Cosine Similarity", title = "White") +
  theme_dark() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 14),
    axis.title = element_text(face = "bold", size = 12),
    axis.text = element_text(size = 9),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 9),
    legend.position = "bottom" # Move legend to bottom
  ) +
  scale_color_brewer(palette = "RdGy") + # Different color palette
  guides(color = guide_legend(title = "Features")) +
  facet_wrap(~ target, scales = "fixed", ncol = 2)
print(crime_white_cs)

# Stitch the individual cosine-similarity plots together with patchwork. The
# basic labels and titles have to be removed from the individual plots and
# reintegrated through plot_annotation().
# NOTE(review): only `crime_white_cs` is built in the code above; the other
# four plots are presumably produced by re-running that section with other
# feature sets -- confirm they exist before running this.
crime_panels <- crime_arab_cs + crime_black_cs + crime_jew_cs + crime_latino_cs + crime_white_cs

# Shared title and caption for the combined figure
crime_panel_labels <- plot_annotation(
  title = "Cosine Similarity to Crime and Violence",
  subtitle = NULL,
  caption = "Cosine similarity estimation using a la carte logic
                  to crim* + violen*.",
  tag_levels = NULL,
  tag_prefix = NULL,
  tag_suffix = NULL,
  tag_sep = NULL,
  theme = NULL
)

crime_panels + crime_panel_labels






# ******************* COSINE SIMILARITY BEFORE AND AFTER *****************************

# Cosine similarity between each period's embedding of a central concept and a
# specific set of comparison features, computed separately per source.
set.seed(2021L)
test_concept <- c("immigra*", "border*") # this is the central idea
# these are the ideas to compare to it
test_features <- c(
  "arabs", "muslim", "muslims",
  "latino", "hispanic", "mexican",
  "black", "blacks", "negro",
  "jew", "jewish", "jews",
  "white", "whites", "caucasian"
)

# Tokenized corpus of the contexts surrounding the central concept
bank_toks <- tokens_context(x = toks_nostop_feats_sample, pattern = test_concept, window = 6L)
# we limit candidates to features in our corpus
feats <- featnames(dfm(bank_toks))

# Test set: drop 2015 and 2016 (when the Trump election is in flux), then keep
# 2010 onwards so both sides of the comparison cover five years
# (2010-2014 and 2017-2021).
bank_toks_test <- bank_toks
in_flux_years <- as.data.frame(docvars(bank_toks_test))$year %in% c(2015, 2016)
bank_toks_test <- bank_toks_test[!in_flux_years]
from_2010 <- as.data.frame(docvars(bank_toks_test))$year >= 2010
bank_toks_test <- bank_toks_test[from_2010]

# Split the test set by source and inspect the most frequent context features
test_sources <- as.data.frame(docvars(bank_toks_test))$source
data_fox <- bank_toks_test[test_sources == "Fox"]
data_fox_dfm <- dfm(data_fox)
topfeatures(data_fox_dfm, n = 50)
data_sf <- bank_toks_test[test_sources == "Stormfront"]
data_sf_dfm <- dfm(data_sf)
topfeatures(data_sf_dfm, n = 50)

# Bootstrapped cosine similarities for one source, grouped by the `trump`
# docvar; the raw result is printed and then tagged with the source label.
trump_cos_sim <- function(context_toks, source_label) {
  sims <- get_cos_sim(
    x = context_toks,
    groups = docvars(context_toks, 'trump'),
    features = test_features,
    pre_trained = local_glove_sample,
    transform = TRUE,
    transform_matrix = local_transform_sample,
    bootstrap = TRUE,
    num_bootstraps = 100,
    as_list = FALSE
  )
  print(sims)
  sims %>%
    mutate(source = source_label)
}

test_cos <- data.frame() #reset empty dataframe to load new data into
test_cos1 <- trump_cos_sim(data_fox, "Fox")
test_cos2 <- trump_cos_sim(data_sf, "Stormfront")

# Stack the two sources into one table
test_cos <- bind_rows(test_cos1, test_cos2)
print(test_cos)

# Display order of the comparison terms (three terms per group)
custom_order <- c(
  "arabs", "muslim", "muslims",
  "latino", "hispanic", "mexican",
  "black", "blacks", "negro",
  "jew", "jewish", "jews",
  "white", "whites", "caucasian"
)

# Tidy the table for plotting:
#  - `target` (0 / 1) becomes the labelled factor `Treatment`
#  - terms are ordered as in `custom_order`
#  - "Fox" is shown as "Fox News"
#  - `chunk_group` numbers the groups of three terms
test_cos <- test_cos %>%
  rename(Treatment = target) %>%
  mutate(
    Treatment = factor(Treatment, levels = c(0, 1), labels = c("Before Trump", "After Trump")),
    feature = factor(feature, levels = custom_order),
    source = gsub("Fox", "Fox News", source),
    chunk_group = ceiling(as.numeric(factor(feature)) / 3)
  )

# Positions of the dotted separators drawn between groups of three terms
chunk_boundaries <- seq(0.5, max(as.numeric(factor(test_cos$feature))), by = 3)

# Point estimates with confidence-interval bars, one panel per source and the
# two periods dodged side by side
plot <- ggplot(
  test_cos,
  aes(x = feature, y = value, color = Treatment, group = interaction(source, Treatment))
) +
  geom_point(position = position_dodge(width = 0.3)) +
  geom_errorbar(
    aes(ymin = lower.ci, ymax = upper.ci),
    width = 0.2,
    position = position_dodge(width = 0.3)
  ) +
  facet_wrap(~source, ncol = 1) +
  theme_minimal(base_size = 14) + # Larger base font for readability
  geom_vline(xintercept = chunk_boundaries, color = "grey80", linetype = "dotted") +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 8),
    legend.title = element_text(size = 10),
    legend.text = element_text(size = 8),
    # Thin border around the legend
    legend.background = element_rect(color = "darkgrey", linewidth = 0.25, linetype = "solid"),
    strip.text = element_text(size = 12), # Facet labels
    plot.caption = element_text(size = 8, hjust = 0),
    plot.caption.position = "panel"
  ) +
  labs(
    #title = "Pre-Post Analysis of Trump's Election",
    x = "Term",
    y = "Cosine Similarity to 'Immigration and Border'",
    caption = "NOTE: The 'Before Trump' treatment constitutes a sample of data from 2010-2014,
    while the 'After Trump' treatment is made up of a sample of text from 2017-2021. Cosine
    Similarity is to the terms 'immigra*' and 'border*'."
  ) +
  scale_color_manual(values = c("Before Trump" = "darkblue", "After Trump" = "darkgoldenrod1")) +
  coord_flip()

# Display the plot
print(plot)
















# Nearest contexts for the target term "media", by source ******************
# Contexts: 6 tokens on either side of every occurrence of "media"
bank_toks <- tokens_context(
  x = toks_nostop_feats_sample,
  pattern = "media",
  window = 6L
)
bank_dfm <- dfm(bank_toks)

# One embedding per context
bank_dem <- dem(
  x = bank_dfm,
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  verbose = TRUE
)

# Corpus-wide embedding of "media": column means of the context embeddings
bank_wv <- matrix(colMeans(bank_dem), ncol = ncol(bank_dem))
rownames(bank_wv) <- "media"

# One averaged embedding per value of the 'source' docvar
bank_wv_source <- dem_group(bank_dem, groups = bank_dem@docvars$source)

# Top 10 contexts by cosine similarity to each source's embedding. All
# contexts are candidates, so a context listed under one source may
# originate from the other.
bank_ncs <- ncs(
  x = bank_wv_source,
  contexts_dem = bank_dem,
  contexts = bank_toks,
  N = 10,
  as_list = TRUE
)
bank_ncs[["Fox"]]
bank_ncs[["Stormfront"]]

# Nearest contexts for Fox only, grouped by the 'election_test' docvar
bank_toks <- tokens_context(
  x = toks_nostop_feats_sample,
  pattern = c("latino", "hispanic", "mexican"),
  window = 6L
)

# Keep only the contexts that come from Fox documents
fox_toks <- bank_toks[docvars(bank_toks, "source") == "Fox"]
fox_dfm <- dfm(fox_toks)

# NOTE(review): this uses local_glove_sample2 / local_transform_sample2 while
# the "media" analysis on the same tokens uses the un-suffixed objects;
# confirm that is intended.
fox_dem <- dem(
  x = fox_dfm,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  verbose = TRUE
)

# Pooled embedding over all three patterns (row is labelled "latino")
fox_wv <- matrix(colMeans(fox_dem), ncol = ncol(fox_dem))
rownames(fox_wv) <- "latino"

# One averaged embedding per election_test group
fox_wv_trump <- dem_group(fox_dem, groups = fox_dem@docvars$election_test)

# Top 10 Fox contexts by cosine similarity to each group's embedding; a
# context listed under one group may originate from the other group.
fox_ncs <- ncs(
  x = fox_wv_trump,
  contexts_dem = fox_dem,
  contexts = fox_toks,
  N = 10,
  as_list = TRUE
)
View(fox_ncs[["1"]])
View(fox_ncs[["0"]])

# Nearest contexts for Stormfront only, grouped by the 'election_test' docvar
bank_toks <- tokens_context(
  x = toks_nostop_feats_sample,
  pattern = c("latino", "hispanic", "mexican"),
  window = 6L
)

# Keep only the contexts that come from Stormfront documents
sf_toks <- bank_toks[docvars(bank_toks, "source") == "Stormfront"]
sf_dfm <- dfm(sf_toks)

sf_dem <- dem(
  x = sf_dfm,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  verbose = TRUE
)

# Pooled embedding over all three patterns (row is labelled "latino")
sf_wv <- matrix(colMeans(sf_dem), ncol = ncol(sf_dem))
rownames(sf_wv) <- "latino"

# One averaged embedding per election_test group
sf_wv_trump <- dem_group(sf_dem, groups = sf_dem@docvars$election_test)

# Top 10 Stormfront contexts by cosine similarity to each group's embedding;
# a context listed under one group may originate from the other group.
sf_ncs <- ncs(
  x = sf_wv_trump,
  contexts_dem = sf_dem,
  contexts = sf_toks,
  N = 10,
  as_list = TRUE
)
View(sf_ncs[["1"]])
View(sf_ncs[["0"]])



# Same comparison through the get_ncs() wrapper, this time grouped by source.
# bank_toks is whatever the preceding section left behind (the
# latino/hispanic/mexican contexts), so this must run after it.
set.seed(2021L)
immig_party_ncs <- get_ncs(
  x = bank_toks,
  N = 10,
  groups = docvars(bank_toks, "source"),
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  bootstrap = TRUE,
  num_bootstraps = 100,
  as_list = TRUE
)
print(immig_party_ncs$Fox$context)
print(immig_party_ncs$Stormfront$context)







# Nearest neighbours by source ******************

# Contexts (6 tokens either side) around the target patterns
# "immigra*" and "border*"
nns_toks <- tokens_context(
  x = toks_nostop_feats_sample,
  pattern = c("immigra*", "border*"),
  window = 6L
)

# Candidate neighbours are restricted to features that occur in those contexts
nns_feats <- featnames(dfm(nns_toks))

# Top 20 nearest neighbours per source; bootstrapping is switched off, so
# num_bootstraps and confidence_level are not used here.
nns <- get_nns(
  x = nns_toks,
  N = 20,
  groups = docvars(nns_toks, "source"),
  candidates = nns_feats,
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  bootstrap = FALSE,
  num_bootstraps = 100,
  confidence_level = 0.95,
  as_list = TRUE
)
print(nns)

# Pull out each source's table and stack them
f <- nns[["Fox"]]
s <- nns[["Stormfront"]]
y <- nns[["Youtube AIN"]]
nns_race <- bind_rows(f, s, y) # change tag for each policy
# The nearest-neighbour tables are curated by hand in CSV files; load each
# one and keep only the columns shown in the published tables, in display
# order.
policy_nns <- read_csv("Nearest_Neighbors_and_Policy_Domains.csv") %>%
  select(all_of(c("policy", "Fox News", "Youtube Alternative Influence Network", "Stormfront")))

racial_nns <- read_csv("nearest_neighbors_racial_categories.csv") %>%
  select(all_of(c("...1", "Fox", "Youtube AIN", "Stormfront")))

broad_nns <- read_csv("nearest_neighbors_politics_race_full_features.csv") %>%
  select(all_of(c("...1", "Fox", "Youtube AIN", "Stormfront")))

library(gt)
# Tip: run gt::info_google_fonts() interactively to browse the fonts that
# google_font() can load; it is not needed to build the tables below.

# Build one nearest-neighbours table in the shared house style.
#
# data   : data frame/tibble to display (one column per source).
# title  : Markdown string used as the table heading.
# note   : Markdown string used as the source note under the table.
# labels : named list mapping column names in `data` to display labels.
#
# Returns the gt table object; it is rendered when the call is auto-printed
# at top level.
make_nns_table <- function(data, title, note, labels) {
  gt(data) %>%
    tab_header(title = md(title)) %>%
    tab_source_note(source_note = md(note)) %>%
    cols_label(.list = labels) %>%
    tab_options(
      table.font.size = px(15),
      source_notes.font.size = px(10),
      column_labels.font.size = px(18),
      heading.title.font.size = px(20)
    ) %>%
    opt_table_font(font = list(google_font(name = "Spectral")))
}

# Table 1: 20 nearest neighbours to each policy domain, by source.
make_nns_table(
  policy_nns,
  title = "**Table 1.** Nearest Neighbors: Policy Domains",
  note = "**Note:** Top 20 nearest neighbors by cosine similarity displayed,
    in order of proximity.
    Crime vector constituted by crim* + violen* vectors.
    Economy vector constituted by bank* + financ* + econom* vectors. Health vector
    constituted by health* vector. Immigration vector constituted by immigra* + border*
    vectors. Media vector constituted by media* + speech + censor* vectors. Polity
    vector constituted by nation* + country* vectors.",
  labels = list(
    "policy" = "Policy Domain",
    "Fox News" = "Fox News",
    "Stormfront" = "Stormfront",
    "Youtube Alternative Influence Network" = "Youtube AIN"
  )
)

# Table 2: 20 nearest neighbours to each racial/ethnic category, by source.
make_nns_table(
  racial_nns,
  title = "**Table 2.** Nearest Neighbors: Racial and Ethnic Categories",
  note = "**Note:** Top 20 nearest neighbors by cosine similarity displayed,
    in order of proximity.
    Arab vector constituted by arab* + muslim* vectors.
    Black vector constituted by black* + negro* vectors. Jewish vector constituted
    by jew* vector. Latino vector constituted by latin* + mexic* + hispan* vectors.
    White vector constituted by white* vector.",
  labels = list(
    "...1" = "Racial or Ethnic Category",
    "Fox" = "Fox News",
    "Stormfront" = "Stormfront",
    "Youtube AIN" = "Youtube AIN"
  )
)

# Table 3: 20 nearest neighbours to the broad politics/race vectors, by source.
make_nns_table(
  broad_nns,
  title = "**Table 3.** Nearest Neighbors: Broad Strokes of Race and Policy",
  note = "**Note:** Top 20 nearest neighbors by cosine similarity displayed,
    in order of proximity.
    Politics and Policy vector constituted by politi* + policy vectors.
    Race and Ethnicity vector constituted by racist + racia* vectors. ",
  labels = list(
    "...1" = "Broad Category",
    "Fox" = "Fox News",
    "Stormfront" = "Stormfront",
    "Youtube AIN" = "Youtube AIN"
  )
)










# # # Initialize an empty dataframe
# df_normed_coefficients <- data.frame()
# 
# # Select two source binary comparison
# test_source_tokens = toks_nostop_feats_sample[as.data.frame(docvars(toks_nostop_feats_sample))$source %in% c("Stormfront", "Fox")]
# 
# # Loop through the years
# for (year in 2005:2020) {
# 
#   # Subset the data for the given year
#   data_year = test_source_tokens[as.data.frame(docvars(test_source_tokens))$year == year]
# 
#   # Run the model with error handling
#   tryCatch({
#     model2 <- conText(formula = c("also", "because", "but") ~ source,
#                       data = data_year,
#                       pre_trained = local_glove_sample,
#                       transform = TRUE, transform_matrix = local_transform_sample,
#                       bootstrap = FALSE, num_bootstraps = 100,
#                       stratify = FALSE, jackknife = TRUE,
#                       confidence_level = 0.95,
#                       permute = TRUE, num_permutations = 100,
#                       window = 6L, case_insensitive = TRUE,
#                       hard_cut = FALSE, verbose = FALSE)
# 
#     # Append the year and normed_coefficients to the dataframe
#     normed_coef <- model2@normed_coefficients
#     normed_coef$year <- year
#     df_normed_coefficients <- rbind(df_normed_coefficients, normed_coef)
#   }, error = function(e) {
#     # Print the error message and continue with the next iteration
#     print(paste("Error in year", year, ":", e))
#   })
# }
# 
# # View the final dataframe
# print(df_normed_coefficients)
# 
# 
# # Function to determine the significance level
# get_significance <- function(p_value){
#   if (p_value < .001) return('***')
#   else if (p_value < .01) return('**')
#   else if (p_value < .05) return('*')
#   else return('')
# }
# # Create a new variable in your dataframe to hold the significance labels
# df_normed_coefficients$signif <- sapply(df_normed_coefficients$p.value, get_significance)
# 
# df_normed_coefficients_base <- df_normed_coefficients %>%
#   mutate(source = "Control")
# 
# base_plot <- #other plot aesthetics (with error lines instead of bars)
#   ggplot(df_normed_coefficients_base, aes(x=year, y=normed.estimate)) +
#   geom_line(linewidth = 1, color = "#0072B2") +
#   geom_point(size = 1, color = "#D55E00", shape = 21, fill = "#CC79A7") +
#   geom_line(aes(x = year, y = lower.ci), color = "gray60", size = 0.5 , linetype = "dotted") +
#   geom_line(aes(x = year, y = upper.ci), color = "gray60", size = 0.5, linetype = "dotted") +
#   geom_text(aes(label=signif), vjust=-1, color="black", size=4) +
#   labs(x="Year", y="Norm of β(hat)", title = "Christian-Pagan Divide Semantic Divide",
#        caption = "Table 1. Difference in meaning of terms 'also', 'because' and 'would'
#        between Republican and Democratic Parties
#        (Significance codes:  '***' 0.001 '**' 0.01 '*' 0.05)") +
#   theme_grey() +
#   theme(
#     plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
#     axis.title = element_text(face = "bold", size = 10),
#     axis.text = element_text(size = 7),
#     legend.position = "none"
#   )
# print(base_plot)
# 
# save(df_normed_coefficients_base, file = "df_normed_coefficients_base.RData")
# Restore the cached control-term results instead of re-running the
# commented-out model above (expected to provide df_normed_coefficients_base).
load(file = "df_normed_coefficients_base.RData")


# BASE PLOT 2 ----------------

# # Initialize an empty dataframe
# df_normed_coefficients2 <- data.frame()
# 
# # Select two source binary comparison
# test_source_tokens2 = toks_nostop_feats_sample2[as.data.frame(docvars(toks_nostop_feats_sample2))$source %in% c("Stormfront", "Fox")]
# 
# # Loop through the years
# for (year in 2005:2020) {
#   
#   # Subset the data for the given year
#   data_year2 = test_source_tokens2[as.data.frame(docvars(test_source_tokens2))$year == year]
#   
#   # Run the model with error handling
#   tryCatch({
#     model2 <- conText(formula = c("also", "because", "but") ~ source,
#                       data = data_year2,
#                       pre_trained = local_glove_sample,
#                       transform = TRUE, transform_matrix = local_transform_sample,
#                       bootstrap = FALSE, num_bootstraps = 100,
#                       stratify = FALSE, jackknife = TRUE,
#                       confidence_level = 0.95,
#                       permute = TRUE, num_permutations = 100,
#                       window = 6L, case_insensitive = TRUE,
#                       hard_cut = FALSE, verbose = FALSE)
#     
#     # Append the year and normed_coefficients to the dataframe
#     normed_coef <- model2@normed_coefficients
#     normed_coef$year <- year
#     df_normed_coefficients2 <- rbind(df_normed_coefficients2, normed_coef)
#   }, error = function(e) {
#     # Print the error message and continue with the next iteration
#     print(paste("Error in year", year, ":", e))
#   })
# }
# 
# # View the final dataframe
# print(df_normed_coefficients2)
# 
# 
# # Function to determine the significance level
# get_significance <- function(p_value){
#   if (p_value < .001) return('***')
#   else if (p_value < .01) return('**')
#   else if (p_value < .05) return('*')
#   else return('')
# }
# # Create a new variable in your dataframe to hold the significance labels
# df_normed_coefficients2$signif <- sapply(df_normed_coefficients2$p.value, get_significance)
# 
# df_normed_coefficients_base2 <- df_normed_coefficients2 %>%
#   mutate(source = "Control")
# 
# base_plot2 <- #other plot aesthetics (with error lines instead of bars)
#   ggplot(df_normed_coefficients_base2, aes(x=year, y=normed.estimate)) +
#   geom_line(linewidth = 1, color = "#0072B2") +
#   geom_point(size = 1, color = "#D55E00", shape = 21, fill = "#CC79A7") +
#   geom_line(aes(x = year, y = lower.ci), color = "gray60", size = 0.5 , linetype = "dotted") +
#   geom_line(aes(x = year, y = upper.ci), color = "gray60", size = 0.5, linetype = "dotted") +
#   geom_text(aes(label=signif), vjust=-1, color="black", size=4) +
#   labs(x="Year", y="Norm of β(hat)", title = "Christian-Pagan Divide Semantic Divide",
#        caption = "Table 1. Difference in meaning of terms 'also', 'because' and 'would'
#        between Republican and Democratic Parties
#        (Significance codes:  '***' 0.001 '**' 0.01 '*' 0.05)") +
#   theme_grey() +
#   theme(
#     plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
#     axis.title = element_text(face = "bold", size = 10),
#     axis.text = element_text(size = 7),
#     legend.position = "none"
#   )
# print(base_plot2)
# 
# save(df_normed_coefficients_base2, file = "df_normed_coefficients_base2.RData")
# Restore the cached second control run instead of re-running the
# commented-out model above (expected to provide df_normed_coefficients_base2).
load(file = "df_normed_coefficients_base2.RData")

# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# --------------------------EMBEDDING REGRESSION OVER TIME BETWEEN TWO PLATFORMS
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# Yearly embedding regressions of the target terms "nation*" / "countr*" on
# source (Stormfront vs Fox). The normed coefficients of every year that fits
# successfully are collected into df_normed_coefficients.

# Restrict the corpus to the two sources being compared
test_source_tokens <- toks_nostop_feats_sample[
  docvars(toks_nostop_feats_sample, "source") %in% c("Stormfront", "Fox")
]

# One slot per year; slots of failed years stay NULL and are dropped below.
# Collecting in a list avoids re-copying the data frame on every iteration.
regression_years <- 2005:2020
yearly_coefficients <- vector("list", length(regression_years))

for (i in seq_along(regression_years)) {
  year <- regression_years[i]

  # Subset the data for the given year
  data_year <- test_source_tokens[docvars(test_source_tokens, "year") == year]

  # A year whose model cannot be fitted is reported and skipped, not fatal
  year_result <- tryCatch({
    model2 <- conText(formula = c("nation*", "countr*") ~ source,
                      data = data_year,
                      pre_trained = local_glove_sample2,
                      transform = TRUE, transform_matrix = local_transform_sample2,
                      bootstrap = FALSE, num_bootstraps = 100,
                      stratify = FALSE, jackknife = TRUE,
                      confidence_level = 0.95,
                      permute = TRUE, num_permutations = 100,
                      window = 6L, case_insensitive = TRUE,
                      hard_cut = FALSE, verbose = FALSE)

    # Tag the normed coefficients with the year they belong to
    normed_coef <- model2@normed_coefficients
    normed_coef$year <- year
    normed_coef
  }, error = function(e) {
    # conditionMessage() yields just the message text; pasting the condition
    # object itself produces a garbled two-element string.
    message("Error in year ", year, ": ", conditionMessage(e))
    NULL
  })

  # Assigning NULL into a list slot would delete it, so only store successes
  if (!is.null(year_result)) {
    yearly_coefficients[[i]] <- year_result
  }
}

# Stack the successful years; fall back to an empty data frame if none fitted
yearly_coefficients <- Filter(Negate(is.null), yearly_coefficients)
df_normed_coefficients <- if (length(yearly_coefficients) > 0) {
  do.call(rbind, yearly_coefficients)
} else {
  data.frame()
}

# View the final dataframe next to the cached control results
print(df_normed_coefficients)
print(df_normed_coefficients_base)

# Map p-values to conventional significance stars.
#
# p_value : numeric vector of p-values (any length, may contain NA).
#
# Returns a character vector of the same length:
#   "***" for p < .001, "**" for p < .01, "*" for p < .05, "" otherwise.
# Missing p-values get "" (no evidence of significance) instead of raising
# an error, and the function is vectorised, so it can be called directly on
# a column as well as element-wise through sapply()/vapply().
get_significance <- function(p_value) {
  stars <- rep("", length(p_value))
  known <- !is.na(p_value)
  # Apply thresholds from weakest to strongest so the strongest match wins
  stars[known & p_value < .05] <- "*"
  stars[known & p_value < .01] <- "**"
  stars[known & p_value < .001] <- "***"
  stars
}
# Add the significance stars for each year's p-value. vapply() always yields
# a character vector (sapply() would return a list for empty input).
df_normed_coefficients$signif <- vapply(
  df_normed_coefficients$p.value,
  get_significance,
  character(1)
)

# Mark these rows as the "Test" series (the cached control rows are "Control")
df_normed_coefficients <- df_normed_coefficients %>%
  mutate(source = "Test")
print(df_normed_coefficients)

# Control and test series stacked in long form
test_experiment <- rbind(df_normed_coefficients_base, df_normed_coefficients)

# Only year, series label and estimate are needed for the difference
test_dif <- test_experiment[, c("year", "source", "normed.estimate")]

# One table per series, with the estimate column renamed so the two can sit
# side by side after the merge
control_df <- subset(test_dif, source == "Control")
colnames(control_df)[colnames(control_df) == "normed.estimate"] <- "control_estimate"
test_df <- subset(test_dif, source == "Test")
colnames(test_df)[colnames(test_df) == "normed.estimate"] <- "test_estimate"

# Pair the series by year (years missing from either series drop out)
merged_df <- merge(test_df, control_df, by = "year")

# Test estimate minus control estimate, per year
merged_df$estimate_diff <- with(merged_df, test_estimate - control_estimate)
test_dif <- merged_df[, c("year", "estimate_diff")]

# Output the result
print(test_dif)


# Control vs test series over time: solid line = normed estimate, dotted
# lines = confidence bounds, stars = significance labels from `signif`.
# Dotted lines use `linewidth` (the `size` aesthetic for lines is deprecated
# since ggplot2 3.4.0). The caption names the test terms actually used by the
# yearly model ('nation*', 'countr*').
# NOTE(review): the caption lists 'would' as a control term; confirm against
# the run that produced the cached control results.
test_plot <- ggplot(test_experiment, aes(x = year, y = normed.estimate, group = source, color = source)) +
  geom_line(linewidth = 1) +
  geom_point(size = 1, shape = 21, aes(fill = source)) +
  geom_line(aes(x = year, y = lower.ci), linewidth = 0.5, linetype = "dotted") +
  geom_line(aes(x = year, y = upper.ci), linewidth = 0.5, linetype = "dotted") +
  geom_text(aes(label = signif), vjust = -1, color = "black", size = 2) +
  labs(x = "Year", y = "Norm of β(hat)", title = "Fox-Stormfront Embedding Regression: Country",
       caption = "Figure x.
       Comparison between embeddings of Stormfront and Fox News corpora.
       Control terms are 'also', 'because' and 'would.' Test terms are 'nation*' and 'countr*'.
       (Significance codes:  '***' 0.001 '**' 0.01 '*' 0.05)") +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
    axis.title = element_text(face = "bold", size = 10),
    axis.text = element_text(size = 7),
    #legend.position = "none"
  ) +
  scale_color_manual(values = c("#2F36BC", "#2FBC34")) # one colour per source, in level order
print(test_plot) # both series over time

# Yearly gap between the test and control estimates (test minus control).
# Title and caption describe the 'nation*' / 'countr*' model that produced
# test_dif.
# NOTE(review): the caption lists 'would' as a control term; confirm against
# the run that produced the cached control results.
dif_plot <- ggplot(test_dif, aes(x = year, y = estimate_diff)) +
  geom_line(linewidth = 1) +
  labs(x = "Year", y = "Norm of β(hat) Above Control", title = "Fox-Stormfront Embedding Regression: Country",
       caption = "Figure x.
       Comparison between embeddings of Stormfront and Fox News corpora.
       Control terms are 'also', 'because' and 'would.' Test terms are 'nation*' and 'countr*'.
       (Significance codes:  '***' 0.001 '**' 0.01 '*' 0.05)") +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
    axis.title = element_text(face = "bold", size = 10),
    axis.text = element_text(size = 7),
    #legend.position = "none"
  )
print(dif_plot)


# Flag the years in which the control and test confidence intervals do not
# overlap. Inputs: test_experiment (both series, long form) and test_dif
# (year, estimate_diff).

# Split the long table into its control and test series
control_data <- test_experiment %>% filter(source == "Control")
test_data <- test_experiment %>% filter(source == "Test")

# Put both series side by side per year (columns get _control / _test
# suffixes), then attach the yearly difference
merged_data <- merge(control_data, test_data, by = "year", suffixes = c("_control", "_test"))
merged_data <- merge(merged_data, test_dif, by = "year")

# "*" when the two intervals are disjoint, NA otherwise. The comparison is
# vectorised over all years; a missing bound yields NA instead of an error,
# and the column is always character.
ci_disjoint <- with(
  merged_data,
  upper.ci_control < lower.ci_test | lower.ci_control > upper.ci_test
)
merged_data$signif <- ifelse(ci_disjoint, "*", NA_character_)

# Selecting relevant columns
result_df <- merged_data %>% select(year, estimate_diff, signif)

# Both tables carry `estimate_diff`, so this join yields estimate_diff.x and
# estimate_diff.y; the combined plot reads estimate_diff.x. For the same
# reason this section cannot be re-run without first rebuilding test_dif.
test_dif <- left_join(test_dif, result_df, by = "year")

# View the result
print(result_df)

# Combined figure: both series with dotted confidence bounds, plus a shaded
# ribbon for the test-minus-control gap (dark red above zero, green below)
# and "*" where the confidence intervals do not overlap.
# Line widths use `linewidth`; the `size` aesthetic for lines is deprecated
# since ggplot2 3.4.0.
combined_plot <- ggplot() +
  geom_line(data = test_experiment, aes(x = year, y = normed.estimate, group = source, color = source), linewidth = 1) +
  geom_point(data = test_experiment, aes(x = year, y = normed.estimate, fill = source), size = 1, shape = 21) + # shape set as a constant
  geom_line(data = test_experiment, aes(x = year, y = lower.ci, group = source), linewidth = 0.5, linetype = "dotted") +
  geom_line(data = test_experiment, aes(x = year, y = upper.ci, group = source), linewidth = 0.5, linetype = "dotted") +
  #geom_text(data = test_experiment, aes(x = year, y = normed.estimate, label = signif, group = source), vjust = -1, color = "black", size = 4) +
  # estimate_diff.x is the yearly difference column left by the join with result_df
  geom_ribbon(data = test_dif, aes(x = year, ymax = ifelse(estimate_diff.x > 0, estimate_diff.x, 0), ymin = 0), fill = "darkred", alpha = 0.5) +
  geom_ribbon(data = test_dif, aes(x = year, ymax = 0, ymin = ifelse(estimate_diff.x < 0, estimate_diff.x, 0)), fill = "#2FBC34", alpha = 0.5) +
  geom_text(data = test_dif, aes(x = year, y = estimate_diff.x, label = signif), vjust = 1, color = "black", size = 2) +
  labs(x = "Year", y = "Norm of β(hat)", title = "Combined Fox-Stormfront Embedding Regression: Country",
       caption = "Combined figure.
       Control terms are 'also', 'because' and 'would.' Test terms vary.") +
  theme_bw() +
  theme(
    plot.title = element_text(face = "bold", hjust = 0.5, size = 12),
    axis.title = element_text(face = "bold", size = 10),
    axis.text = element_text(size = 7)
  ) +
  # Axis limits: observations outside these ranges are dropped from the plot
  ylim(-0.5, 3) +
  xlim(2010, 2020) +
  scale_color_manual(values = c("grey", "#2FBC34")) # one colour per source, in level order

# Print the combined plot
print(combined_plot)




# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# --------------------------------EMBEDDING REGRESSION BEFORE AND AFTER ELECTION
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
set.seed(2021L)
# ------------------------------------------------------------------------------
# ---------------PRE-POST WITHIN SOURCE COMPARISONS-----------------------------
# ------------------------------------------------------------------------------

# Within-source comparison: keep one source only (Fox here)
test_source_tokens2 <- toks_nostop_feats_sample[
  docvars(toks_nostop_feats_sample, "source") %in% c("Fox")
]
# Optional: drop the 2015-2016 documents entirely
#test_source_tokens2 <- test_source_tokens2[!as.data.frame(docvars(test_source_tokens2))$year %in% c(2015, 2016)]
# Restrict to the 2013-2016 window (inclusive)
within_years <- docvars(test_source_tokens2, "year")
test_source_tokens2 <- test_source_tokens2[within_years >= 2013 & within_years <= 2016]

# Features present in the selected documents
feats <- featnames(dfm(test_source_tokens2))

# Embedding regression of the targets "immigra*" / "border*" on the
# election_ctrl docvar: jackknife estimates and a 100-draw permutation test
# (bootstrapping is switched off).
model4 <- conText(
  formula = c("immigra*", "border*") ~ election_ctrl,
  data = test_source_tokens2,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  bootstrap = FALSE,
  num_bootstraps = 100,
  stratify = FALSE,
  jackknife = TRUE,
  confidence_level = 0.95,
  permute = TRUE,
  num_permutations = 100,
  window = 6L,
  case_insensitive = TRUE,
  hard_cut = FALSE,
  verbose = FALSE
)


#---------- Figures for the pre/post within-source comparisons
# The racial-category and policy-category figures share one loader and one
# plotting routine; only the input file, the category column, the title and
# the axis label differ.

# Read a hand-assembled table of embedding-regression results.
#
# path : CSV file with (at least) a `p-value` column.
#
# Returns a data frame in which `p-value` is renamed to `p_value` and a
# factor `p_value_binary` ("Yes" when p_value < 0.05, otherwise "No") is added.
read_pre_post_estimates <- function(path) {
  as.data.frame(read_csv(path)) %>%
    rename(p_value = "p-value") %>%
    mutate(p_value_binary = as.factor(ifelse(p_value < 0.05, "Yes", "No")))
}

# Dot-and-whisker plot of normed estimates, one facet per Source.
#
# estimates    : data frame from read_pre_post_estimates(); the plot reads
#                the columns `Normed Estimate`, `Lower CI`, `Upper CI`,
#                Treatment, Source and p_value_binary.
# category_col : name (string) of the column shown on the category axis.
# plot_title   : plot title.
# x_label      : label for the category axis.
#
# Returns the ggplot object; it is drawn when the call is auto-printed at
# top level.
plot_pre_post_estimates <- function(estimates, category_col, plot_title, x_label) {
  dodge <- position_dodge(width = 0.25)

  ggplot(estimates, aes(x = .data[[category_col]], y = `Normed Estimate`,
                        color = p_value_binary, shape = Treatment)) +
    geom_point(position = dodge) +
    geom_errorbar(aes(ymin = `Lower CI`, ymax = `Upper CI`), width = 0.2, position = dodge) +
    facet_wrap(~ Source, ncol = 1) +
    theme_minimal() +
    scale_shape_manual(values = c(16, 17)) +  # one shape per Treatment level
    scale_color_manual(values = c("No" = "goldenrod", "Yes" = "black")) +
    labs(title = plot_title,
         x = x_label,
         y = "Normed Estimate",
         color = "p-value < 0.05
       (empirical)",
         shape = "Treatment") +
    ylim(0, 3) +  # estimates or CI bounds outside [0, 3] are dropped
    coord_flip() +
    theme(
      plot.title = element_text(size = 16, face = "bold"),
      axis.title = element_text(size = 12, face = "bold"),
      axis.text.y = element_text(size = 8),
      legend.title = element_text(size = 10),
      legend.text = element_text(size = 8),
      # thin border around the legend
      legend.background = element_rect(color = "darkgrey", linewidth = 0.25, linetype = "solid"),
      strip.text = element_text(size = 12),  # facet labels
      plot.caption = element_text(size = 8, hjust = 0),
      plot.caption.position = "panel"
    )
}

#---------- Figure for RACIAL category comparisons
pre_post_within_racial <- read_pre_post_estimates("pre_post_within_racial - Sheet1.csv")

plot_pre_post_estimates(
  pre_post_within_racial,
  category_col = "racial_category",
  plot_title = "Embedding Regression Estimates:
       2016 Election Cycle Exposure",
  x_label = "Racial Category"
)

#---------- Figure for POLICY category comparisons
pre_post_within_policy <- read_pre_post_estimates("pre_post_within_policy - Sheet1.csv")

plot_pre_post_estimates(
  pre_post_within_policy,
  category_col = "policy_category",
  plot_title = "Embedding Regression Estimates:
       2016 Election Cycle Exposure on Policy",
  x_label = "Policy Category"
)




# ------------------------------------------------------------------------------
# ---------------PRE-POST BETWEEN SOURCE COMPARISONS-----------------------------
# ------------------------------------------------------------------------------

# Here we only delineate between the election_test variable and compare between source

# Between-source comparison: keep every source, but only the documents whose
# election_test docvar equals 0
test_source_tokens2 <- toks_nostop_feats_sample[
  docvars(toks_nostop_feats_sample, "election_test") == 0
]
# Optional: drop the 2015-2016 documents entirely
#test_source_tokens2 <- test_source_tokens2[!as.data.frame(docvars(test_source_tokens2))$year %in% c(2015, 2016)]
# Restrict to the 2013-2016 window (inclusive)
between_years <- docvars(test_source_tokens2, "year")
test_source_tokens2 <- test_source_tokens2[between_years >= 2013 & between_years <= 2016]

# Features present in the selected documents
feats <- featnames(dfm(test_source_tokens2))

# Embedding regression of the targets "white" / "whites" / "caucasian" on
# source: jackknife estimates and a 100-draw permutation test (bootstrapping
# is switched off). This replaces the model4 fitted in the within-source
# section.
model4 <- conText(
  formula = c("white", "whites", "caucasian") ~ source,
  data = test_source_tokens2,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  bootstrap = FALSE,
  num_bootstraps = 100,
  stratify = FALSE,
  jackknife = TRUE,
  confidence_level = 0.95,
  permute = TRUE,
  num_permutations = 100,
  window = 6L,
  case_insensitive = TRUE,
  hard_cut = FALSE,
  verbose = FALSE
)



















# ------------------------------------------------------------------------------
# ---------------QUALITATIVE TESTS----------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# --------------------------------------NEAREST NEIGHBORS-----------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# Nearest neighbours within Fox, grouped by the election_test docvar
test_source_tokens2 <- toks_nostop_feats_sample[
  docvars(toks_nostop_feats_sample, "source") %in% c("Fox")
]
# Optional: drop the 2015-2016 documents entirely
#test_source_tokens2 <- test_source_tokens2[!as.data.frame(docvars(test_source_tokens2))$year %in% c(2015, 2016)]
# Restrict to the 2013-2016 window (inclusive)
nns_years <- docvars(test_source_tokens2, "year")
test_source_tokens2 <- test_source_tokens2[nns_years >= 2013 & nns_years <= 2016]

# Candidate neighbours: every feature present in the selected documents
feats_nns <- featnames(dfm(test_source_tokens2))

# NOTE(review): x is the filtered document tokens, not the output of
# tokens_context() around a target term as in the earlier nearest-neighbour
# section; confirm that embedding whole documents is intended here.
fox_nns <- get_nns(
  x = test_source_tokens2,
  N = 20,
  groups = docvars(test_source_tokens2, "election_test"),
  candidates = feats_nns,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  bootstrap = FALSE,
  num_bootstraps = 100,
  confidence_level = 0.95,
  as_list = TRUE
)
print(fox_nns)

# # make sf
# test_source_tokens2_sf <- test_source_tokens2[as.data.frame(docvars(test_source_tokens2))$source == "Stormfront"]
# sf_feats <- featnames(dfm(test_source_tokens2_sf))
# sf_nns <- get_nns(x = test_source_tokens2_sf, N = 20,
#                    groups = docvars(test_source_tokens2_sf, 'trump'),
#                    candidates = sf_feats,
#                    pre_trained = local_glove_sample,
#                    transform = TRUE,
#                    transform_matrix = local_transform_sample,
#                    bootstrap = FALSE,
#                    num_bootstraps = 100, 
#                    confidence_level = 0.95,
#                    as_list = TRUE)
# print(sf_nns)

# Nearest neighbours grouped by `source` (top 20 per group, no bootstrapping).
# NOTE(review): test_source_tokens2 was last assigned from a Fox-only
# selection, so grouping by `source` presumably yields a single group here --
# confirm whether a two-source token set was intended.
# NOTE(review): this call uses `feats`, `local_glove_sample` and
# `local_transform_sample`, whereas the Fox call above uses `feats_nns`,
# `local_glove_sample2` and `local_transform_sample2` -- verify which set of
# objects is the right one for this comparison.
nns <- get_nns(
  x = test_source_tokens2,
  N = 20,
  groups = docvars(test_source_tokens2, "source"),
  candidates = feats,
  pre_trained = local_glove_sample,
  transform = TRUE,
  transform_matrix = local_transform_sample,
  bootstrap = FALSE,
  num_bootstraps = 100,
  confidence_level = 0.95,
  as_list = TRUE
)
print(nns)






# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# --------------------------------------NEAREST NEIGHBOR COSINE SIMILARITY RATIO
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------
# ------------------------------------------------------------------------------

# Pooled Stormfront + Fox tokens for the overall comparison (takes a while).
test_source_tokens2 <- toks_nostop_feats_sample[
  docvars(toks_nostop_feats_sample)$source %in% c("Stormfront", "Fox")
]
feats_csr <- featnames(dfm(test_source_tokens2))

# Focused comparison: contexts (window = 6L) around the target terms.
bank_toks2 <- tokens_context(
  x = test_source_tokens2,
  pattern = c("latino", "hispanic", "mexican"),
  window = 6L
)
# Pooled vocabulary of those contexts (both sources together).
feats2 <- featnames(dfm(bank_toks2))

# Per-source contexts and their vocabularies. Each vocabulary is built from
# its own subset; previously both were built from the pooled `bank_toks2`, so
# the "fox" and "sf" feature sets were identical copies of `feats2`.
test_source_tokens2_fox_nncsr <- bank_toks2[docvars(bank_toks2)$source == "Fox"]
feats2_fox_nncsr <- featnames(dfm(test_source_tokens2_fox_nncsr))

test_source_tokens2_sf_nncsr <-
  bank_toks2[docvars(bank_toks2)$source == "Stormfront"]
feats2_sf_nncsr <- featnames(dfm(test_source_tokens2_sf_nncsr))

# Cosine-similarity ratio of nearest neighbours between the `election_test`
# groups within the Stormfront contexts; the group coded "1" is the numerator.
# Candidates are limited to features that occur in the Stormfront contexts,
# i.e. the same corpus passed as `x` (previously the Fox-named set was used).
# The seed is set immediately before the call because bootstrapping and
# permutation are both enabled.
set.seed(2021L)
immig_nns_ratio <- get_nns_ratio(
  x = test_source_tokens2_sf_nncsr,
  N = 20,
  groups = docvars(test_source_tokens2_sf_nncsr, "election_test"),
  numerator = "1",
  candidates = feats2_sf_nncsr,
  pre_trained = local_glove_sample2,
  transform = TRUE,
  transform_matrix = local_transform_sample2,
  bootstrap = TRUE,
  num_bootstraps = 100,
  permute = TRUE,
  num_permutations = 100,
  verbose = FALSE
)
plot_nns_ratio(x = immig_nns_ratio, alpha = 0.05, horizontal = FALSE)









